In [28]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import sys
import plotly.express as px
import os
import seaborn as sns
import json
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
import plotly.offline as pyo
pyo.init_notebook_mode() 
sys.tracebacklimit = 0 # turn off the error tracebacks
In [29]:
# Problem 1 - Get user agent and make headers dictionary
r = requests.get('https://httpbin.org/user-agent')
useragent = json.loads(r.text)['user-agent']
headers = {'User-Agent': 'data science passion project (vrd9sd@virginia.edu) (Language=Python 3.8.2; platform=Mac OSX 13.0.1'}

# Url to get HTML from 
url = 'https://waterdata.usgs.gov/va/nwis/current/?type=gw'
r = requests.get(url, headers=headers)

# Parsing HTML code 
mysoup = BeautifulSoup(r.text, 'html.parser')
In [30]:
# make empty list of counties
counties=[]
# populate county list with one instance for each site number under that county
for td in mysoup.find_all('td'):
    if 'colspan' in td.attrs:
        county_name = td.strong.text
        current_county = county_name
    elif td.find('a') and td.find('a').get('href', '').startswith('/va/nwis/uv'):
        counties.append(current_county[1:])
In [31]:
station_number = [x.string for x in mysoup.find_all(lambda tag: tag.has_attr('href') and tag['href'].startswith('/va/nwis/uv'))]
In [32]:
date_and_site = [x.string for x in mysoup.find_all('td', attrs = {'nowrap':'nowrap'})]
dates = [item[1:-1] for i, item in enumerate(date_and_site) if i%2==1]
site_name = [item[1:-1] for i, item in enumerate(date_and_site) if i%2==0]

water_depth_strings = [x.find_all('td')[3].string for x in mysoup.find_all('tr', attrs = {'align':'right'})]
water_depths = [item[:-1] if item != None else np.nan for item in water_depth_strings]

Making County names consistent with Fips Database¶

In [33]:
counties =  [x.replace('Of', 'of') for x in counties] # uncapitalize 'Of' in places
counties = [x.replace('And', 'and') for x in counties] # uncapitalize 'And' for places
counties = [x[:-4] + 'city' if x.endswith('City') else x for x in counties] # uncapitalize 'City' in places
In [34]:
groundwater_data = pd.DataFrame({
    'Jurisdiction': counties,
    'station number': station_number, 
    'dates': dates,
    'site_name': site_name,
    'water table depth': water_depths})

Outputting to CSV File¶

In [35]:
file_path = 'water_table_depths.csv'
if not os.path.exists(file_path):
    groundwater_data.to_csv(file_path, index=False)
else:
    groundwater_data.to_csv(file_path, mode='a', header=False, index=False)

Make the date current eastern time datetime object¶

In [36]:
# from datetime import datetime
# from zoneinfo import ZoneInfo
# edt_timezone = ZoneInfo('US/Eastern')
# datetime_objects = []
# for date in dates:
#     date_string = date.replace(' EDT', '') # strip edt since we set 
#     date_object = datetime.strptime(date_string, "%m/%d %H:%M") # parse into datetime object
#     date_object = date_object.replace(year=datetime.now().year) # Add current year
#     date_object = date_object.astimezone(edt_timezone) # localize to edt timezone
#     datetime_objects.append(date_object) # append to list

# print(datetime_objects[-1])

Merging Fips-County data with groundwater_data¶

In [37]:
data = pd.read_csv('vacounties.csv') # read in vacounties data 
va_fips = data.iloc[:, :2] #isolate county and FIPS columns
In [38]:
# Come back to this to merge correctly (counties named city turns into error)
merged_data = pd.merge(va_fips, groundwater_data, on='Jurisdiction', how='right', indicator=True) # merge groundwater data with counties
In [39]:
merged_data.query("_merge!='both'") # Identify Counties that dont have match in right df Should be empty
Out[39]:
FIPS Jurisdiction station number dates site_name water table depth _merge
In [40]:
merged_data['water table depth'] = merged_data['water table depth'].astype('float')
In [41]:
grouped = merged_data.groupby('Jurisdiction').agg(
    mean_county_depth=('water table depth', 'mean'),
    median_county_depth = ('water table depth', 'median'),
    num_county_stations = ('water table depth', 'size'),
    min_county_depth = ('water table depth', 'min'),
    max_county_depth = ('water table depth', 'max'),
    q25_county_depth = ('water table depth', lambda x: np.percentile(x, 75)),
    q75_county_depth = ('water table depth', lambda x: np.percentile(x, 75)),
    ).reset_index()
grouped.head()
Out[41]:
Jurisdiction mean_county_depth median_county_depth num_county_stations min_county_depth max_county_depth q25_county_depth q75_county_depth
0 Accomack County 47.454167 41.540 12 9.30 101.23 75.0800 75.0800
1 Appomattox County -0.580000 -0.580 1 -0.58 -0.58 -0.5800 -0.5800
2 Augusta County 15.600000 15.600 1 15.60 15.60 15.6000 15.6000
3 Bath County 5.860000 5.860 1 5.86 5.86 5.8600 5.8600
4 Bedford County 16.932500 12.265 4 6.10 37.10 20.0675 20.0675
In [42]:
merged_with_stats = pd.merge(merged_data, grouped, on='Jurisdiction', how='left')
merged_with_stats.head()
Out[42]:
FIPS Jurisdiction station number dates site_name water table depth _merge mean_county_depth median_county_depth num_county_stations min_county_depth max_county_depth q25_county_depth q75_county_depth
0 51001 Accomack County 374425075400001 10/02 17:00 EDT 65K 27 SOW 114A 50.74 both 47.454167 41.54 12 9.3 101.23 75.08 75.08
1 51001 Accomack County 374425075400002 10/02 17:00 EDT 65K 28 SOW 114B 79.46 both 47.454167 41.54 12 9.3 101.23 75.08 75.08
2 51001 Accomack County 374425075400003 10/02 17:00 EDT 65K 29 SOW 114C 101.23 both 47.454167 41.54 12 9.3 101.23 75.08 75.08
3 51001 Accomack County 375315075332101 10/02 16:30 EDT 66M 57 76.16 both 47.454167 41.54 12 9.3 101.23 75.08 75.08
4 51001 Accomack County 375315075332102 10/02 16:30 EDT 66M 58 69.47 both 47.454167 41.54 12 9.3 101.23 75.08 75.08

Look at distribution of groundwater depths¶

In [43]:
sns.histplot(merged_with_stats['water table depth'], bins=9)
plt.xlabel('Water Table Depth (ft)')
plt.ylabel('Frequency')
plt.title('VA Water Table Depths at USGS monitoring locations')
plt.show()
No description has been provided for this image

Making Chloropleth of water table depths by counties¶

In [44]:
# Get geojson for county fips
import json
url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
r = requests.get(url)
geojson_counties = json.loads(r.text)
In [45]:
# Showing Virginia choropleth for FIPS that are in merged groundwater_data
fig = px.choropleth(merged_with_stats,
                    geojson=geojson_counties,
                    color='num_county_stations',
                    scope='usa',
                    locations='FIPS',
                    hover_name = 'Jurisdiction',
                    color_continuous_scale = px.colors.sequential.Greys, # [::-1], # reverses the color scale
                    hover_data = ['num_county_stations', 'mean_county_depth'],
                    title = 'VA Water Table Depth monitoring stations by County Vizualization'
                                  )
fig.update_geos(fitbounds='locations')
fig.show()
In [46]:
# remove counties with few monitoring stations
merged_with_stats = merged_with_stats[merged_with_stats['num_county_stations'] > 2]
In [47]:
fig2 = px.choropleth(merged_with_stats,
                    geojson=geojson_counties,
                    color='mean_county_depth',
                    scope='usa',
                    locations='FIPS',
                    hover_name = 'Jurisdiction',
                    color_continuous_scale = px.colors.sequential.PuBu, #[::-1], # reverses the color scale
                    hover_data = [ 'mean_county_depth', 'min_county_depth', 'max_county_depth','num_county_stations'],
                    title = 'Mean water table depths by VA County (> 2 stations)'
                                  )
fig2.update_geos(fitbounds='locations')
fig2.show()
In [48]:
import plotly.io as pio
pio.write_html(fig, 'choropleth_mean_depth.html')
pio.write_html(fig2, 'choropleth_num_stations.html')
In [49]:
from IPython.display import IFrame

# Embed the first figure
IFrame('choropleth_mean_depth.html', width=800, height=600)

# Embed the second figure
IFrame('choropleth_num_stations.html', width=800, height=600)
Out[49]: